Part 1

Reading the Data

if (!require(tidyverse)) {
  install.packages("tidyverse")}
## Loading required package: tidyverse
## Warning: package 'tidyverse' was built under R version 4.1.1
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.3     v dplyr   1.0.7
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   2.0.0     v forcats 0.5.1
## Warning: package 'ggplot2' was built under R version 4.1.1
## Warning: package 'forcats' was built under R version 4.1.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
if (!require(data.table)) {
  install.packages("data.table")}
## Loading required package: data.table
## Warning: package 'data.table' was built under R version 4.1.1
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## The following object is masked from 'package:purrr':
## 
##     transpose
if (!require(lubridate)) {
  install.packages("lubridate")
}
## Loading required package: lubridate
## Warning: package 'lubridate' was built under R version 4.1.1
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:data.table':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
air_2004 <- data.table::fread("2004_air.csv")
air_2019 <- data.table::fread("2019_air.csv")

Check Dimensions

dim(air_2004)
## [1] 19233    20
dim(air_2019)
## [1] 53086    20

Check Headers and Footers

head(air_2004)
##          Date Source  Site ID POC Daily Mean PM2.5 Concentration    UNITS
## 1: 01/01/2004    AQS 60010007   1                            8.9 ug/m3 LC
## 2: 01/02/2004    AQS 60010007   1                           12.2 ug/m3 LC
## 3: 01/03/2004    AQS 60010007   1                           16.5 ug/m3 LC
## 4: 01/04/2004    AQS 60010007   1                           18.1 ug/m3 LC
## 5: 01/05/2004    AQS 60010007   1                           11.5 ug/m3 LC
## 6: 01/06/2004    AQS 60010007   1                           32.5 ug/m3 LC
##    DAILY_AQI_VALUE Site Name DAILY_OBS_COUNT PERCENT_COMPLETE
## 1:              37 Livermore               1              100
## 2:              51 Livermore               1              100
## 3:              60 Livermore               1              100
## 4:              64 Livermore               1              100
## 5:              48 Livermore               1              100
## 6:              94 Livermore               1              100
##    AQS_PARAMETER_CODE                     AQS_PARAMETER_DESC CBSA_CODE
## 1:              88101               PM2.5 - Local Conditions     41860
## 2:              88502 Acceptable PM2.5 AQI & Speciation Mass     41860
## 3:              88502 Acceptable PM2.5 AQI & Speciation Mass     41860
## 4:              88101               PM2.5 - Local Conditions     41860
## 5:              88502 Acceptable PM2.5 AQI & Speciation Mass     41860
## 6:              88502 Acceptable PM2.5 AQI & Speciation Mass     41860
##                            CBSA_NAME STATE_CODE      STATE COUNTY_CODE  COUNTY
## 1: San Francisco-Oakland-Hayward, CA          6 California           1 Alameda
## 2: San Francisco-Oakland-Hayward, CA          6 California           1 Alameda
## 3: San Francisco-Oakland-Hayward, CA          6 California           1 Alameda
## 4: San Francisco-Oakland-Hayward, CA          6 California           1 Alameda
## 5: San Francisco-Oakland-Hayward, CA          6 California           1 Alameda
## 6: San Francisco-Oakland-Hayward, CA          6 California           1 Alameda
##    SITE_LATITUDE SITE_LONGITUDE
## 1:      37.68753      -121.7842
## 2:      37.68753      -121.7842
## 3:      37.68753      -121.7842
## 4:      37.68753      -121.7842
## 5:      37.68753      -121.7842
## 6:      37.68753      -121.7842
head(air_2019)
##          Date Source  Site ID POC Daily Mean PM2.5 Concentration    UNITS
## 1: 01/01/2019    AQS 60010007   3                            5.7 ug/m3 LC
## 2: 01/02/2019    AQS 60010007   3                           11.9 ug/m3 LC
## 3: 01/03/2019    AQS 60010007   3                           20.1 ug/m3 LC
## 4: 01/04/2019    AQS 60010007   3                           28.8 ug/m3 LC
## 5: 01/05/2019    AQS 60010007   3                           11.2 ug/m3 LC
## 6: 01/06/2019    AQS 60010007   3                            2.7 ug/m3 LC
##    DAILY_AQI_VALUE Site Name DAILY_OBS_COUNT PERCENT_COMPLETE
## 1:              24 Livermore               1              100
## 2:              50 Livermore               1              100
## 3:              68 Livermore               1              100
## 4:              86 Livermore               1              100
## 5:              47 Livermore               1              100
## 6:              11 Livermore               1              100
##    AQS_PARAMETER_CODE       AQS_PARAMETER_DESC CBSA_CODE
## 1:              88101 PM2.5 - Local Conditions     41860
## 2:              88101 PM2.5 - Local Conditions     41860
## 3:              88101 PM2.5 - Local Conditions     41860
## 4:              88101 PM2.5 - Local Conditions     41860
## 5:              88101 PM2.5 - Local Conditions     41860
## 6:              88101 PM2.5 - Local Conditions     41860
##                            CBSA_NAME STATE_CODE      STATE COUNTY_CODE  COUNTY
## 1: San Francisco-Oakland-Hayward, CA          6 California           1 Alameda
## 2: San Francisco-Oakland-Hayward, CA          6 California           1 Alameda
## 3: San Francisco-Oakland-Hayward, CA          6 California           1 Alameda
## 4: San Francisco-Oakland-Hayward, CA          6 California           1 Alameda
## 5: San Francisco-Oakland-Hayward, CA          6 California           1 Alameda
## 6: San Francisco-Oakland-Hayward, CA          6 California           1 Alameda
##    SITE_LATITUDE SITE_LONGITUDE
## 1:      37.68753      -121.7842
## 2:      37.68753      -121.7842
## 3:      37.68753      -121.7842
## 4:      37.68753      -121.7842
## 5:      37.68753      -121.7842
## 6:      37.68753      -121.7842
tail(air_2004)
##          Date Source  Site ID POC Daily Mean PM2.5 Concentration    UNITS
## 1: 12/14/2004    AQS 61131003   1                             11 ug/m3 LC
## 2: 12/17/2004    AQS 61131003   1                             16 ug/m3 LC
## 3: 12/20/2004    AQS 61131003   1                             17 ug/m3 LC
## 4: 12/23/2004    AQS 61131003   1                              9 ug/m3 LC
## 5: 12/26/2004    AQS 61131003   1                             24 ug/m3 LC
## 6: 12/29/2004    AQS 61131003   1                              9 ug/m3 LC
##    DAILY_AQI_VALUE            Site Name DAILY_OBS_COUNT PERCENT_COMPLETE
## 1:              46 Woodland-Gibson Road               1              100
## 2:              59 Woodland-Gibson Road               1              100
## 3:              61 Woodland-Gibson Road               1              100
## 4:              38 Woodland-Gibson Road               1              100
## 5:              76 Woodland-Gibson Road               1              100
## 6:              38 Woodland-Gibson Road               1              100
##    AQS_PARAMETER_CODE       AQS_PARAMETER_DESC CBSA_CODE
## 1:              88101 PM2.5 - Local Conditions     40900
## 2:              88101 PM2.5 - Local Conditions     40900
## 3:              88101 PM2.5 - Local Conditions     40900
## 4:              88101 PM2.5 - Local Conditions     40900
## 5:              88101 PM2.5 - Local Conditions     40900
## 6:              88101 PM2.5 - Local Conditions     40900
##                                  CBSA_NAME STATE_CODE      STATE COUNTY_CODE
## 1: Sacramento--Roseville--Arden-Arcade, CA          6 California         113
## 2: Sacramento--Roseville--Arden-Arcade, CA          6 California         113
## 3: Sacramento--Roseville--Arden-Arcade, CA          6 California         113
## 4: Sacramento--Roseville--Arden-Arcade, CA          6 California         113
## 5: Sacramento--Roseville--Arden-Arcade, CA          6 California         113
## 6: Sacramento--Roseville--Arden-Arcade, CA          6 California         113
##    COUNTY SITE_LATITUDE SITE_LONGITUDE
## 1:   Yolo      38.66121      -121.7327
## 2:   Yolo      38.66121      -121.7327
## 3:   Yolo      38.66121      -121.7327
## 4:   Yolo      38.66121      -121.7327
## 5:   Yolo      38.66121      -121.7327
## 6:   Yolo      38.66121      -121.7327
tail(air_2019)
##          Date Source  Site ID POC Daily Mean PM2.5 Concentration    UNITS
## 1: 11/11/2019    AQS 61131003   1                           13.5 ug/m3 LC
## 2: 11/17/2019    AQS 61131003   1                           18.1 ug/m3 LC
## 3: 11/29/2019    AQS 61131003   1                           12.5 ug/m3 LC
## 4: 12/17/2019    AQS 61131003   1                           23.8 ug/m3 LC
## 5: 12/23/2019    AQS 61131003   1                            1.0 ug/m3 LC
## 6: 12/29/2019    AQS 61131003   1                            9.1 ug/m3 LC
##    DAILY_AQI_VALUE            Site Name DAILY_OBS_COUNT PERCENT_COMPLETE
## 1:              54 Woodland-Gibson Road               1              100
## 2:              64 Woodland-Gibson Road               1              100
## 3:              52 Woodland-Gibson Road               1              100
## 4:              76 Woodland-Gibson Road               1              100
## 5:               4 Woodland-Gibson Road               1              100
## 6:              38 Woodland-Gibson Road               1              100
##    AQS_PARAMETER_CODE       AQS_PARAMETER_DESC CBSA_CODE
## 1:              88101 PM2.5 - Local Conditions     40900
## 2:              88101 PM2.5 - Local Conditions     40900
## 3:              88101 PM2.5 - Local Conditions     40900
## 4:              88101 PM2.5 - Local Conditions     40900
## 5:              88101 PM2.5 - Local Conditions     40900
## 6:              88101 PM2.5 - Local Conditions     40900
##                                  CBSA_NAME STATE_CODE      STATE COUNTY_CODE
## 1: Sacramento--Roseville--Arden-Arcade, CA          6 California         113
## 2: Sacramento--Roseville--Arden-Arcade, CA          6 California         113
## 3: Sacramento--Roseville--Arden-Arcade, CA          6 California         113
## 4: Sacramento--Roseville--Arden-Arcade, CA          6 California         113
## 5: Sacramento--Roseville--Arden-Arcade, CA          6 California         113
## 6: Sacramento--Roseville--Arden-Arcade, CA          6 California         113
##    COUNTY SITE_LATITUDE SITE_LONGITUDE
## 1:   Yolo      38.66121      -121.7327
## 2:   Yolo      38.66121      -121.7327
## 3:   Yolo      38.66121      -121.7327
## 4:   Yolo      38.66121      -121.7327
## 5:   Yolo      38.66121      -121.7327
## 6:   Yolo      38.66121      -121.7327

Check Variable Names and Variable Types

str(air_2004)
## Classes 'data.table' and 'data.frame':   19233 obs. of  20 variables:
##  $ Date                          : chr  "01/01/2004" "01/02/2004" "01/03/2004" "01/04/2004" ...
##  $ Source                        : chr  "AQS" "AQS" "AQS" "AQS" ...
##  $ Site ID                       : int  60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 ...
##  $ POC                           : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Daily Mean PM2.5 Concentration: num  8.9 12.2 16.5 18.1 11.5 32.5 14 29.9 21 15.7 ...
##  $ UNITS                         : chr  "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" ...
##  $ DAILY_AQI_VALUE               : int  37 51 60 64 48 94 55 88 70 59 ...
##  $ Site Name                     : chr  "Livermore" "Livermore" "Livermore" "Livermore" ...
##  $ DAILY_OBS_COUNT               : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ PERCENT_COMPLETE              : num  100 100 100 100 100 100 100 100 100 100 ...
##  $ AQS_PARAMETER_CODE            : int  88101 88502 88502 88101 88502 88502 88101 88502 88502 88101 ...
##  $ AQS_PARAMETER_DESC            : chr  "PM2.5 - Local Conditions" "Acceptable PM2.5 AQI & Speciation Mass" "Acceptable PM2.5 AQI & Speciation Mass" "PM2.5 - Local Conditions" ...
##  $ CBSA_CODE                     : int  41860 41860 41860 41860 41860 41860 41860 41860 41860 41860 ...
##  $ CBSA_NAME                     : chr  "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" ...
##  $ STATE_CODE                    : int  6 6 6 6 6 6 6 6 6 6 ...
##  $ STATE                         : chr  "California" "California" "California" "California" ...
##  $ COUNTY_CODE                   : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ COUNTY                        : chr  "Alameda" "Alameda" "Alameda" "Alameda" ...
##  $ SITE_LATITUDE                 : num  37.7 37.7 37.7 37.7 37.7 ...
##  $ SITE_LONGITUDE                : num  -122 -122 -122 -122 -122 ...
##  - attr(*, ".internal.selfref")=<externalptr>
str(air_2019)
## Classes 'data.table' and 'data.frame':   53086 obs. of  20 variables:
##  $ Date                          : chr  "01/01/2019" "01/02/2019" "01/03/2019" "01/04/2019" ...
##  $ Source                        : chr  "AQS" "AQS" "AQS" "AQS" ...
##  $ Site ID                       : int  60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 ...
##  $ POC                           : int  3 3 3 3 3 3 3 3 3 3 ...
##  $ Daily Mean PM2.5 Concentration: num  5.7 11.9 20.1 28.8 11.2 2.7 2.8 7 3.1 7.1 ...
##  $ UNITS                         : chr  "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" ...
##  $ DAILY_AQI_VALUE               : int  24 50 68 86 47 11 12 29 13 30 ...
##  $ Site Name                     : chr  "Livermore" "Livermore" "Livermore" "Livermore" ...
##  $ DAILY_OBS_COUNT               : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ PERCENT_COMPLETE              : num  100 100 100 100 100 100 100 100 100 100 ...
##  $ AQS_PARAMETER_CODE            : int  88101 88101 88101 88101 88101 88101 88101 88101 88101 88101 ...
##  $ AQS_PARAMETER_DESC            : chr  "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" ...
##  $ CBSA_CODE                     : int  41860 41860 41860 41860 41860 41860 41860 41860 41860 41860 ...
##  $ CBSA_NAME                     : chr  "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" ...
##  $ STATE_CODE                    : int  6 6 6 6 6 6 6 6 6 6 ...
##  $ STATE                         : chr  "California" "California" "California" "California" ...
##  $ COUNTY_CODE                   : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ COUNTY                        : chr  "Alameda" "Alameda" "Alameda" "Alameda" ...
##  $ SITE_LATITUDE                 : num  37.7 37.7 37.7 37.7 37.7 ...
##  $ SITE_LONGITUDE                : num  -122 -122 -122 -122 -122 ...
##  - attr(*, ".internal.selfref")=<externalptr>

Key Variables

#table(air_2004$Date)
#table(air_2019$Date)
summary(air_2004$`Daily Mean PM2.5 Concentration`)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   -0.10    6.00   10.10   13.13   16.30  251.00
summary(air_2019$`Daily Mean PM2.5 Concentration`)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -2.200   4.000   6.500   7.734   9.900 120.900
##Checking to see if any NA's exist 
mean(is.na(air_2004$`Daily Mean PM2.5 Concentration`)) 
## [1] 0
mean(is.na(air_2019$`Daily Mean PM2.5 Concentration`))
## [1] 0

Summary of Findings

  • From checks ran on the variables of the datasets, there are not any glaring issues as there are not any NA’s present in the PM2.5 Concentration values. However, the only concering factor would be that there are negative PM2.5 values, which generally isn’t possible unless due to calibrations and such. So for the sake of the analysis, we will remove these negative values but that will be done after we merge the data, as it is simpler to remove from one concantenated dataset rather than two seperate datasets.

Part 2

Combining Data into One Data Frame, Creating Year Column, Changing names of key variables

air_2004_2019 <- rbind(air_2004,air_2019)
#air_2004_2019$Year <- format(as.Date(air_2004_2019$Date, format="%m/%d/%Y"),"%Y")
air_2004_2019 <- mutate(air_2004_2019, year = factor(rep(c(2004, 2019), c(nrow(air_2004), nrow(air_2019)))))
colnames(air_2004_2019)[which(names(air_2004_2019) == "Daily Mean PM2.5 Concentration")] <- "PM2.5"
colnames(air_2004_2019)[which(names(air_2004_2019) == "SITE_LATITUDE")] <- "lat"
colnames(air_2004_2019)[which(names(air_2004_2019) == "SITE_LONGITUDE")] <- "lon"
air_2004_2019 <- air_2004_2019[PM2.5>=0][order(PM2.5)]
#summary(air_2004_2019$PM2.5)
#gplot(air_2004_2019) + 
  #geom_histogram(mapping = aes(x = PM2.5))

#Part 3

#Create Leaflet map showing the locations of the sites

library(leaflet)
## Warning: package 'leaflet' was built under R version 4.1.1
temp.pal <- colorFactor(palette ='Accent', domain=air_2004_2019$year)

leaflet(air_2004_2019) %>%
  addProviderTiles('CartoDB.Positron') %>%
  addCircles(
    lat = ~lat, lng=~lon,
                                                  # HERE IS OUR PAL!
    label = air_2004_2019$`Site Name`, color = ~ temp.pal(air_2004_2019$year),
    opacity = 1, fillOpacity = 1, radius = 500
    ) %>%
  # And a pretty legend
  addLegend('bottomleft', pal=temp.pal, values=air_2004_2019$year,
          title='Year', opacity=1)
 # addMarkers(air_2004_2019$lon,air_2004_2019$lat, rank(-air_2004_2019$PM2.5) <= 10)

Summarize the Spatial distribution of the monitoring sites

*From the leaflet map, we can see about an even distribution of sites from the 2004 data set and the 2019 data set. There does seem to be a bit more 2004 monitoring sites in comparison to the the 2019 sites. All of the sites seem to be mainly located on the coast of California rather near the borders of the state. #Part 4

#Check for missing or implausible values of PM2.5 in the combined dataset

#table(air_2004_2019$PM2.5)
summary(air_2004_2019$PM2.5)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   4.400   7.300   9.208  11.300 251.000

Summary of Observations

  • We already removed the implausible values for PM2.5 earlier and we already checked for any missing values, so nothing to really change here. In terms of temporal patters, it seems evident that there is a slight decrease in PM2.5 values from 2004 to the 2019, as the max PM2.5 value decreased by nearly 130ug/m3 LC. Moreover, the 2019 datapoints for PM2.5 seem to have a slight rightward skew more so than the 2004 datapoints for PM2.5 Concentrations.

#Part 5

Explore the main question of interest at three different spatial levels. Create exploratory plots (e.g. boxplots, histograms, line plots) and summary statistics that best suit each level of data. Be sure to write up explanations of what you observe in these data.

whether daily concentrations of PM (particulate matter air pollution with aerodynamic diameter less than 2.5 m) have decreased in California over the last 15 years (from 2004 to 2019)

##State

library(ggplot2)
air_2004_2019[!is.na(PM2.5)] %>% 
  ggplot()+
  geom_boxplot(mapping=aes(x=year, y=log2(PM2.5), fill=year)) + labs(title = "PM2.5 Levels in California in 2004 and in 2019") +  labs(x = "Year", y = "log2([PM2.5])")
## Warning: Removed 81 rows containing non-finite values (stat_boxplot).

with(air_2004_2019, tapply(PM2.5, year, summary))
## $`2004`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    6.00   10.10   13.13   16.30  251.00 
## 
## $`2019`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    4.00    6.50    7.78   10.00  120.90
  • From the graph, we see that there is a decrease in the PM2.5 values over the years from 2004 to 2019 in the State of California, as the median value for Log2(PM2.5) is around 3.2, while for 2019, it is around 2.8. which translates to 10.10ug/m3 LC and 6.58 ug/m3 LC respectively in PM2.5 concetration. We logged the PM2.5 values to account for the right skewdness of the data set which allows us to get a better look at the change in PM2.5 concentration over the years. In both cases, there seems to be a lot of outliers overall, which is to be expected.

County

avg_pm_county <- group_by(air_2004_2019, year, COUNTY) %>% summarize(PM_AVG = mean(PM2.5, na.rm = TRUE))
## `summarise()` has grouped output by 'year'. You can override using the `.groups` argument.
tail(avg_pm_county)
## # A tibble: 6 x 3
## # Groups:   year [1]
##   year  COUNTY  PM_AVG
##   <fct> <chr>    <dbl>
## 1 2019  Sutter    8.27
## 2 2019  Tehama    5.50
## 3 2019  Trinity   4.94
## 4 2019  Tulare   11.2 
## 5 2019  Ventura   6.67
## 6 2019  Yolo      6.85
temp.pal <- colorFactor(palette ='Accent', domain=avg_pm_county$COUNTY)
ggplot(data=avg_pm_county, aes(x=as.numeric(as.character(year)), y=PM_AVG, group=COUNTY)) +
  geom_line(color=temp.pal(avg_pm_county$COUNTY))+
  geom_point(color=temp.pal(avg_pm_county$COUNTY))+
  labs(title = "Average PM2.5 Levels Between 2004 to 2019 in All Counties") +  labs(x = "Year", y = "Average [PM2.5]") 
## Warning in RColorBrewer::brewer.pal(max(3, n), palette): n too large, allowed maximum for palette Accent is 8
## Returning the palette you asked for with that many colors

## Warning in RColorBrewer::brewer.pal(max(3, n), palette): n too large, allowed maximum for palette Accent is 8
## Returning the palette you asked for with that many colors

* While this graph, isn’t the most descriptive, it does get the point across that the Average PM2.5 concentration for each county decreased over the years.

Sites in Los Angeles

sites <- filter(air_2004_2019, COUNTY_CODE == 37) %>% select(COUNTY_CODE, `Site ID`, year) %>% unique
sites <- mutate(sites, site.code = paste(COUNTY_CODE, `Site ID`, sep = "."))
str(sites)
## Classes 'data.table' and 'data.frame':   24 obs. of  4 variables:
##  $ COUNTY_CODE: int  37 37 37 37 37 37 37 37 37 37 ...
##  $ Site ID    : int  60376012 60379033 60379034 60371602 60379034 60371201 60379033 60374004 60372005 60370002 ...
##  $ year       : Factor w/ 2 levels "2004","2019": 2 2 1 2 2 2 1 2 2 2 ...
##  $ site.code  : chr  "37.60376012" "37.60379033" "37.60379034" "37.60371602" ...
##  - attr(*, ".internal.selfref")=<externalptr>
site.year <- with(sites, split(site.code, year))
both <- intersect(site.year[[1]], site.year[[2]])
print(both)
## [1] "37.60379034" "37.60379033" "37.60370002" "37.60371103" "37.60371201"
## [6] "37.60374004" "37.60372005" "37.60374002"
count <- mutate(air_2004_2019, site.code = paste(COUNTY_CODE, `Site ID`, sep = ".")) %>% filter(site.code %in% both)
group_by(count, site.code) %>% summarize(n = n())
## # A tibble: 8 x 2
##   site.code       n
##   <chr>       <int>
## 1 37.60370002   400
## 2 37.60371103  1401
## 3 37.60371201   586
## 4 37.60372005   288
## 5 37.60374002   484
## 6 37.60374004  1043
## 7 37.60379033   468
## 8 37.60379034   219
#focus on Site ID 60371103
air_60371103 <- filter(air_2004_2019, COUNTY_CODE == "37" & `Site ID` == "60371103") %>% select(Date, year, PM2.5) %>%  mutate(Date = format(as.Date(Date,format="%Y/%m/%d")), yday = format(as.Date(Date,format="%Y/%d")))
qplot(Date, PM2.5, data = air_60371103, facets = . ~ year, xlab = "Day of the year")

* From the graph, we can see that the PM2.5 Concentration did in fact decrease as noted in previous plots. This shows is very clearly as it maps the trends of the PM25 concentration by the days of the year of each year. However, what is interesting, is that while the 2004 points seem to be decreasing over the year, the 2019 dataset seems to be increasing over the year which can be attributed to a multitude of things, namely the rapid increase in fires in California specificly, but more generally, the increase in prescence of things like power plants and automobiles over of the years.